In [21]:
# Feature Selection with Univariate Statistical Tests (Chi-squared for classification)
import pandas as pd
#from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
In [51]:
# Load the Pima Indians diabetes data set.
# Column labels are passed explicitly through `names`, so pandas treats
# every line of the file (including the first) as data.
filename = 'pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = pd.read_csv(filename, names=names)
In [23]:
# Pull the raw values out of the frame, then separate the first eight
# columns (predictors) from the ninth column (the 'class' label).
array = dataframe.values
X, Y = array[:, :8], array[:, 8]
In [24]:
# Univariate selector: keep the 4 features with the highest chi-squared score
kbest = SelectKBest(chi2, k=4)
In [26]:
# Score each column of X against the labels in Y.
# The return value is bound to `fit`; later cells read the fitted state
# through both `fit` (scores_, transform) and `kbest` (get_support).
fit = kbest.fit(X, Y)
In [64]:
# Get the positional indices of the k best-scoring features.
# This must be called after the selector has been fitted.
idxs_selected = kbest.get_support(indices=True)
print(idxs_selected)

# Print the column name behind each selected index.
# The indices refer to columns of X, which are the first eight columns of
# `dataframe`, so they can be looked up directly in `dataframe.columns`.
colnms = dataframe.columns
for colidx in idxs_selected:
    # The loop body must be indented; without it the cell is a syntax error.
    colnm = colnms[colidx]
    print(colnm)
In [28]:
# Summarize scores: one score per column of X, produced by the chi2
# score function the selector was built with.
# set_printoptions changes NumPy's global print precision (3 digits) for
# every array printed after this point, not just this one.
set_printoptions(precision=3)
print(fit.scores_)
In [29]:
# Reduce X to only the columns the fitted selector chose (k=4 was requested
# when the selector was constructed).
fit_features = fit.transform(X)
In [50]:
# Rebuild a labelled DataFrame from the selected feature matrix.
# `new_features` was never defined in the notebook (it only existed as
# leftover kernel state), so derive it here from the selector's support
# indices; this keeps the cell working after Restart & Run All.
new_features = list(dataframe.columns[kbest.get_support(indices=True)])
dataframe_new = pd.DataFrame(fit_features, columns=new_features)
dataframe_new.head(5)
Out[50]:
In [31]:
# Show the first five rows of the reduced feature matrix
print(fit_features[:5])
In [1]:
# Feature Selection with RFE (Recursive Feature Elimination)
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
In [4]:
# Load the data set again for the RFE section.
# Labels come from `names`, so the first line of the file is read as data.
url = "pima-indians-diabetes.data.csv"
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = read_csv(url, names=names)
colnms = dataframe.columns

# First eight columns are the predictors, the ninth is the label.
array = dataframe.values
X, Y = array[:, :8], array[:, 8]
In [5]:
# Feature selection with Recursive Feature Elimination.
# RFE repeatedly fits the estimator and discards the weakest feature(s)
# until only `n_features_to_select` remain.
model = LogisticRegression()
# n_features_to_select must be passed by keyword: current scikit-learn
# releases reject it as a positional argument (TypeError).
# NOTE(review): LogisticRegression with default settings may emit a
# ConvergenceWarning on unscaled inputs; consider raising max_iter.
rfe = RFE(model, n_features_to_select=4)  # Top 4
fit = rfe.fit(X, Y)
print("Num Features: ", fit.n_features_)
# Boolean mask over the columns of X: True marks a kept feature
print("Selected Features: ", fit.support_)
# NOTE: colnms also contains the trailing 'class' label column, so it has
# one more entry than the mask and ranking printed around it.
print("Column names: ", colnms)
# Rank 1 marks a selected feature; larger ranks were eliminated earlier
print("Feature Ranking: ", fit.ranking_)
In [66]:
# Feature Importance with Extra Trees Classifier
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier
In [67]:
# Load the data set again for the Extra Trees section.
# Labels come from `names`, so the first line of the file is read as data.
url = "pima-indians-diabetes.data.csv"
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = read_csv(url, names=names)

# First eight columns are the predictors, the ninth is the label.
array = dataframe.values
X, Y = array[:, :8], array[:, 8]
In [68]:
# Feature importance from an ensemble of extremely randomized trees.
# The estimator is stochastic, so random_state is fixed to make the printed
# importances reproducible across Restart & Run All. n_estimators is set
# explicitly because its default differs between scikit-learn versions.
model = ExtraTreesClassifier(n_estimators=100, random_state=0)
model.fit(X, Y)
# Larger scores indicate greater feature importance; one value per column of X
print(model.feature_importances_)